Load required libraries and functions

if (!require("pacman")) install.packages("pacman")
pacman::p_load("knitr", "dplyr", "scales", "car", "doParallel", "ggpubr",
               "mediation","tibble","stringr","ggplot2","psych","lavaan",
               "semTools","ggrepel","vegan","phyloseq","FactoMineR","factoextra",
               "gplots","corrplot","ggvegan",
               install = TRUE)
library("MicrobiomeAnalystR")
## Change mediation function to print out up to 3 decimals for mediation

## Patch mediation's summary printer so that estimates show 5 decimals and
## p-values show 3 decimals followed by significance stars.
trace(mediation:::print.summary.mediate, 
      at = 11,
      tracer = quote({
        printCoefmat <- function(x, digits) {
          p <- x[, 4] #p-values seem to be stored rounded
          x[, 1:3] <- sprintf("%.5f", x[, 1:3])
          # Format p to 3 decimals and append stars in one step.
          # (Previously the formatted value was immediately overwritten
          # with the raw, unformatted p.)
          x[, 4] <- paste(sprintf("%.3f", p), gtools::stars.pval(p), sep = " ")
          print(x, quote = FALSE, left = TRUE)
        } 
      }),
      print = FALSE)
## [1] "print.summary.mediate"
## Flatten the correlation matrix from upper or lower triangle into data.frame

## Flatten the upper triangle of a correlation matrix (and its matching
## p-value matrix) into a long-format data frame.
##
## cormat: square correlation matrix with row/column names
## pmat:   matrix of p-values, same dimensions as cormat
## Returns a data.frame with columns: row, column, coeff, p_value.
flattenCorrMatrix <- function(cormat, pmat) {
  upper <- upper.tri(cormat)
  labels <- rownames(cormat)
  data.frame(
    row     = labels[row(cormat)[upper]],
    column  = labels[col(cormat)[upper]],
    coeff   = cormat[upper],
    p_value = pmat[upper]
  )
}

## Seed for reproducibility of permutation tests and other stochastic steps
set.seed(123)

Load datasets

## Load the input tables. Use TRUE/FALSE rather than T/F (T and F are
## reassignable and therefore unsafe).
## - tsi_abundance.txt: abundance table; the "ID" column holds taxon names
## - tsi_metadata.txt: per-sample metadata (contains "ID" and "Site")
## - yarrabah_metaphlan_counts_species.tsv: MetaPhlAn species-level counts
abundance.in <- read.delim("tsi_abundance.txt", check.names = FALSE, sep = "\t")
metadata <- read.delim("tsi_metadata.txt", check.names = FALSE)
abundance.spp <- read.delim("yarrabah_metaphlan_counts_species.tsv",
                            check.names = FALSE, row.names = 1,
                            header = TRUE) # MetaPhlAn species-level only

Explore MetaPhlAn2 abundance estimates for the TSI

We used the MicrobiomeAnalystR package to explore and visualize the taxonomic data by island. This was a quick way to identify any patterns in the data before embarking on time-consuming analyses.

## MicrobiomeAnalystR exploratory pipeline: initialize, load abundance and
## sample metadata, lightly filter, then compute alpha/beta diversity plots.
## NOTE(review): each call mutates and returns the mbSet object, so the
## statement order below matters.
mbSet<-Init.mbSetObj()
## Warning in Cairo::CairoFonts("Arial:style=Regular", "Arial:style=Bold", :
## CairoFonts() has no effect on Windows. Please use par(family="...") to specify
## the desired font - see ?par.
## [1] "Init MicrobiomeAnalyst!"
mbSet<-SetModuleType(mbSet, "mdp")
mbSet<-ReadSampleTable(mbSet, "metadata_4_beta_diversity.txt");
## Warning in FUN(newX[, i], ...): NAs introduced by coercion
mbSet<-Read16SAbundData(mbSet, "metaphlan_abundance_all.txt","text","Others/Not_specific","T");
mbSet<-SanityCheckData(mbSet, "text");
## [1] "Sanity check passed!"
mbSet<-PlotLibSizeView(mbSet, "norm_species_count","png");
mbSet<-CreatePhyloseqObj(mbSet, "text","Others/Not_specific","T")
## Light prevalence filter (>= 10% of samples); no variance filter and no
## normalization, i.e. abundances are used as-is downstream.
mbSet<-ApplyAbundanceFilter(mbSet, "prevalence", 0, 0.1);
mbSet<-ApplyVarianceFilter(mbSet, "iqr", 0.0);
mbSet<-PerformNormalization(mbSet, "none", "none", "none");
## Loading required package: memoise
## Alpha diversity, OTU level: Chao1 richness by location + t-test.
mbSet<-PlotAlphaData(mbSet, "orig","alpha_diver_Chao","Chao1","location","OTU", "default", "png");
mbSet<-PlotAlphaBoxData(mbSet, "alpha_diverbox_Chao","Chao1","location","default", "png");
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.
mbSet<-PerformAlphaDiversityComp(mbSet, "tt","location");
## [1] "p-value: 0.026794; [T-test] statistic: -2.2504"
## Alpha diversity, OTU level: Shannon diversity by location + t-test.
mbSet<-PlotAlphaData(mbSet, "orig","alpha_diver_Shannon","Shannon","location","OTU", "default", "png");
mbSet<-PlotAlphaBoxData(mbSet, "alpha_diverbox_Shannon","Shannon","location","default", "png");
## Warning: `fun.y` is deprecated. Use `fun` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.
mbSet<-PerformAlphaDiversityComp(mbSet, "tt","location");
## [1] "p-value: 0.035256; [T-test] statistic: -2.1355"
## Alpha diversity, species level: Shannon diversity by location + t-test.
mbSet<-PlotAlphaData(mbSet, "orig","alpha_diver_Shannon_spp","Shannon","location","Species", "default", "png");
mbSet<-PlotAlphaBoxData(mbSet, "alpha_diverbox_Shannon_spp","Shannon","location","default", "png");
## Warning: `fun.y` is deprecated. Use `fun` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.
mbSet<-PerformAlphaDiversityComp(mbSet, "tt","location");
## [1] "p-value: 0.017406; [T-test] statistic: -2.4193"
## Alpha diversity, species level: Chao1 richness by location + t-test.
mbSet<-PlotAlphaData(mbSet, "orig","alpha_diver_Chao_spp","Chao1","location","Species", "default", "png");
mbSet<-PlotAlphaBoxData(mbSet, "alpha_diverbox_Chao_spp","Chao1","location","default", "png");
## Warning: `fun.y` is deprecated. Use `fun` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.

## Warning: Use of `data$value` is discouraged. Use `value` instead.
mbSet<-PerformAlphaDiversityComp(mbSet, "tt","location");
## [1] "p-value: 0.0024939; [T-test] statistic: -3.1043"
## Beta diversity at species level: Bray-Curtis PCoA and NMDS by location.
mbSet<-PlotBetaDiversity(mbSet, "beta_diver_PCOA_spp","PCoA","bray","expfac","location","none","Species","Not_Assigned","Observed", "yes", "png", 72, "default");
mbSet<-PlotBetaDiversity(mbSet, "beta_diver_NMDS_spp","NMDS","bray","expfac","location","none","Species","Not_Assigned","Observed", "yes", "png", 72, "default");
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.2223184 
## Run 1 stress 0.2220119 
## ... New best solution
## ... Procrustes: rmse 0.04591519  max resid 0.3680259 
## Run 2 stress 0.2214421 
## ... New best solution
## ... Procrustes: rmse 0.02275082  max resid 0.1939031 
## Run 3 stress 0.2353974 
## Run 4 stress 0.2255925 
## Run 5 stress 0.2253363 
## Run 6 stress 0.2308619 
## Run 7 stress 0.2227583 
## Run 8 stress 0.2415948 
## Run 9 stress 0.2217357 
## ... Procrustes: rmse 0.02934804  max resid 0.1929993 
## Run 10 stress 0.2637086 
## Run 11 stress 0.2394155 
## Run 12 stress 0.2329377 
## Run 13 stress 0.2246119 
## Run 14 stress 0.2323597 
## Run 15 stress 0.2324664 
## Run 16 stress 0.2278253 
## Run 17 stress 0.2367187 
## Run 18 stress 0.2304623 
## Run 19 stress 0.2378818 
## Run 20 stress 0.2385042 
## *** No convergence -- monoMDS stopping criteria:
##      1: no. of iterations >= maxit
##     19: stress ratio > sratmax
Raw NMDS figure on first attempt in MicrobiomeAnalystR

Raw NMDS figure on first attempt in MicrobiomeAnalystR

Diversity and Richness using vegan package in R

## Build a row-normalized abundance matrix for vegan diversity calculations.
abundance <- column_to_rownames(abundance.in, var = "ID")
## Drop all-zero rows, round, then convert each row to relative abundance.
## BUG FIX: the divisor must be rowSums(.) of the table flowing through the
## pipe. The original used rowSums(abundance), which is evaluated on the
## PRE-filter table (assignment happens after the RHS), so its length does
## not match the filtered matrix and sweep() misaligns the division.
abundance <- abundance[!(apply(abundance, 1, function(y) all(y == 0.000))), ] %>%
  round(., 4) %>% sweep(., 1, rowSums(.), '/')
spp.m <- data.matrix(abundance)

## Shannon diversity; the vegan:: prefix avoids masking by other attached packages
shanno_vegan <- as.data.frame(vegan::diversity(spp.m, index = "shannon", MARGIN = 1, base = exp(1))) %>%
  rownames_to_column(var = "Species")

## Shannon evenness via RAM::evenness, which expects taxa as rows with a
## "taxonomy" column — hence the transpose and added column below.
x <- as.data.frame(t(spp.m))
x$taxonomy <- rownames(x)

## library(RAM)
shannon_evenness <- data.frame(RAM::evenness(data=list(x=x), index="shannon"), check.names=FALSE) 
## Registered S3 method overwritten by 'labdsv':
##   method       from
##   summary.dist ade4
## Transpose back so each row is a sample, keyed by the "Species" column
## (NOTE(review): the key column actually holds sample IDs, not species names).
shannon_evenness <- as.data.frame(t(shannon_evenness)) %>%
  rownames_to_column(var = "Species")
# colnames(shannon_evenness) <- gsub("^X", "",  colnames(shannon_evenness))
## Chao1 species richness from vegan::estimateR; keep only the key column
## and the observed richness (first two columns).
chao1 <- data.frame(estimateR(abundance.spp))
chao1 <- as.data.frame(t(chao1)) %>% 
  rownames_to_column(var = "Species")
chao1 <- chao1[1:2]

## Simpson species diversity (per sample, MARGIN = 1)
simpson_vegan <- as.data.frame(vegan::diversity(spp.m, index = "simpson", MARGIN = 1, base = exp(1)))%>% 
  rownames_to_column(var = "Species")

## Pielou's evenness = Shannon diversity / log(species count per sample)
pielou <- data.frame(vegan::diversity(spp.m, index = "shannon", MARGIN = 1, base = exp(1))/log(specnumber(spp.m)))%>%
  rownames_to_column(var = "Species")

## Bray-Curtis dissimilarity between columns of spp.m; flatten the distance
## object into a long table of all column pairs and write it out.
bray <- vegan::vegdist(t(spp.m), method = "bray", na.rm = T)
m <- data.frame(t(combn(colnames(spp.m),2)), as.numeric(bray))
names(m) <- c("c1", "c2", "bray_distance")
write.table(m, "bray_curtisMatrix.tsv", sep = "\t", row.names = F)

## Combine the alpha-diversity measures into one table, joined on the
## shared "Species" key column.
c <- plyr::join_all(list(pielou, shanno_vegan, shannon_evenness, simpson_vegan, chao1), by = "Species", type='left')
## BUG FIX: the join yields six columns (the "Species" key plus five
## measures); the original assigned only five names, a length mismatch.
colnames(c) <- c("Species", "pielou_evenness", "shannon_diversity", "shannon_evenness", "simpson_diversity", "chao_richness")
# kable(head(c, 10))
write.table(c, "diversity_evenness.tsv", sep = "\t", col.names = TRUE)

The PERMANOVA stats on this dataset

## PERMANOVA (vegan::adonis2): test whether Bray-Curtis community
## composition differs by Site, using 10,000 permutations.
## md: sample metadata with ID as rownames, matching the transposed
## abundance table (samples as rows after t()).
md <- subset(metadata, select = c("ID", "Site")) %>% `rownames<-` (NULL) %>% tibble::column_to_rownames(.,var = "ID")
permanova <-adonis2((abundance.in %>%
           tibble::column_to_rownames(.,var = "ID") %>%
              t()) ~ Site, 
        data = md, 
        permutations = 10000, 
        method = "bray")
permanova
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 10000
## 
## adonis2(formula = (abundance.in %>% tibble::column_to_rownames(., var = "ID") %>% t()) ~ Site, data = md, permutations = 10000, method = "bray")
##          Df SumOfSqs      R2      F    Pr(>F)    
## Site      1   0.7521 0.02863 2.8886 9.999e-05 ***
## Residual 98  25.5148 0.97137                     
## Total    99  26.2669 1.00000                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Of course, there is a considerable difference in multivariate spread that could be the cause for the significant PERMANOVA result.

The NMDS plot needed some work!

We took the data from the above analysis and made some cute plots in R

## Read NMDS coordinates exported from PRIMER7 and compute per-site centroids.
df_all <- read.csv("NMDS_primer.csv", row.names = 1) ## in dropbox TSI_analysis from primer7
centroids_all <- aggregate(cbind(NMDS1, NMDS2) ~ Site, df_all, mean)
## Attach each sample's site centroid so every dot can be linked to it.
gg_all <- merge(df_all, aggregate(cbind(mean.X = NMDS1, mean.Y = NMDS2) ~ Site, df_all, mean), by = "Site")
## PLOTS
# Test - with dots, ellipses, and centroid-link segments.
# BUG FIX: the `+` before geom_segment() was commented out (`#+`), so the
# segment layer was evaluated as a bare, detached object and never drawn;
# it is now chained onto the plot as intended.
ggplot(data = gg_all, mapping = aes(NMDS1, NMDS2, color = factor(Site))) +
  geom_point(size = 1.5) +
  stat_ellipse(aes(NMDS1, NMDS2, color = factor(Site)), type = "norm") +
  geom_segment(aes(mean.X, mean.Y, xend = NMDS1, yend = NMDS2))
# With dots and ellipses; axis text/ticks removed
# (http://docs.ggplot2.org/dev/vignettes/themes.html)
cols_DF <- c('#00007fff','#94631eff')
## BUG FIX: complete themes such as theme_bw() must be applied BEFORE any
## theme() tweaks — the original added theme_bw() last, which reset
## axis.text/axis.ticks back to visible and discarded the font sizes.
(all <- ggplot(data = gg_all, mapping = aes(NMDS1, NMDS2, color = factor(Site), fill = factor(Site))) +
    theme_bw() +
    stat_ellipse(aes(NMDS1, NMDS2, color = factor(Site)), geom = "polygon", alpha = 1/8, type = "norm", linetype = 2, size = 0.5) +
    geom_vline(xintercept = 0, colour = "darkgrey", linetype = 2) +
    geom_hline(yintercept = 0, colour = "darkgrey", linetype = 2) +
    theme(axis.title = element_text(size = 16),
          legend.title = element_text(size = 16),
          axis.text = element_blank(),
          axis.ticks = element_blank()) +
    geom_point(size = 3) +
    scale_color_manual(name = "Site", values = c('#00007fff','#94631eff')) +
    scale_fill_manual(name = "Site", values = c('#00007fff','#94631eff')))

ggsave(all, filename = "NMDS.svg", height = 8, width = 10)

After some parameter fine-tuning (see manuscript M&M for exact final parameters) in R and beautification in GraphPad prism7 or Adobe Illustrator, the a) \(\alpha\) diversity on unfiltered species data and b) \(\beta\) diversity on filtered data looked like this:

Original Figure 1 NMDS

Original Figure 1 NMDS

Original Figure 1 PCoA

Original Figure 1 PCoA

P-Values in \(\alpha\)-diversity are ANOVA derived (See ANOVA below)

These data were also augmented with some Spearman correlations between metadata and species relative abundances.

Canonical correspondence analysis

The species abundance and metadata were used to perform CCA for each island

## Prepare the species table for CCA: drop all-zero rows, round, remove taxa
## that are zero in more than 70 samples, then row-normalize.
abundance <- abundance.spp[!(apply(abundance.spp, 1, function(y) all(y == 0.000))),]
abundance <- round(abundance, 4)
abundance <- abundance[rowSums(abundance == 0) <= 70, ]
abundance <- sweep(abundance, 1, rowSums(abundance), '/')

## NOTE(review): `meta` is not defined anywhere earlier in this document —
## presumably a subset of `metadata`; confirm upstream before running.
meta <- meta[,c(1,2,6,12,13,15:19)]
## Abbreviate taxon labels to "G. species"; unclassified taxa instead keep
## their first (genus) word with a trailing "*".
## (Renamed the local from `names` to avoid shadowing base::names; use
## seq_along rather than 1:length, which breaks on empty input.)
taxon_labels <- paste0(substr(rownames(abundance), 0, 1), ". ",
                       stringr::word(rownames(abundance), 2, -1, sep = "\\_"))
x_nms <- rownames(abundance)
for (i in seq_along(taxon_labels)) {
  print(paste0(i, " ", taxon_labels[i]))  # progress/trace output
  if (grepl("unclass", taxon_labels[i])) {
    taxon_labels[[i]] <- paste0((stringr::word(x_nms[[i]], 1, sep = "\\_")), "*")
  }
}
rownames(abundance) <- taxon_labels

## Constrained correspondence analysis with permutation-based significance.
tsi_ccpna <- cca(X = t(abundance), Y = meta)
permutest(tsi_ccpna, permutations = 9999, model = "reduced", alpha = 0.05)
anova(tsi_ccpna, perm.max = 9999, alpha = 0.05, beta = 0.01)

## Two-panel CCA figure: sites (left) and species (right).
plot.new()
# svg("CCA_combined.svg", height = 7, width = 14)
par(mfrow=c(1,2))
##sites
plot(tsi_ccpna,choices=c(1,2),display=c("wa","bp"),type="points",scaling="species", col = "black",
     xlim=c(-4,4),ylim=c(-4,4),
     main = "", cex.lab=1.6,cex.axis=1.5, cex.sub=1.8, pch=19, cex = 2)
## NOTE(review): `colvec` is never defined in this document, and `x` was
## last assigned as t(spp.m) (no `study_site` column) — both look like
## leftovers from another script; confirm before running.
points(tsi_ccpna,disp="sites",pch=20, col= rev(colvec[as.factor(x$study_site)]),cex=1.3, scaling="species")
legend("topleft",title="Island", legend=levels(as.factor(x$study_site)), bty="n",col=colvec,pch=21,pt.bg=colvec,cex = 1.8)
##species
plot(tsi_ccpna,choices=c(1,2),display= "species", type="points",scaling="none",
     xlim=c(-4,4),ylim=c(-4,4), col="grey",
     main = "", cex.lab=1.6,cex.axis=1.5, cex.sub=1.8, pch=19, ylab = "")
points(tsi_ccpna,disp="species",pch=20, col="grey2",bg = "grey2", cex=1.3, scaling="sites")
text(tsi_ccpna,"species",pos=2,axis.bp=TRUE, cex=1, scaling="none", col="black", font = 1.5)

dev.off()

After some manual cleaning!

CCA PLOT

CCA PLOT

We sought to identify these patterns using Distance-based redundancy analysis in PRIMER7

Spearman and CCA Figure 3

Spearman and CCA Figure 3

We also cleaned the SVG files generated from the LEfSe and MetaPhlAn2 / <a href="https://huttenhower.sph.harvard.edu/graphlan" target="_blank">GraPhlAn</a> pipelines.

The MetaPhlAn Graphlan plot plots for this data

The MetaPhlAn Graphlan plot plots for this data

BY ISLAND

BY ISLAND

LEfSe plot for this data

LEfSe plot for this data

Metadata exploration

Normality test on the metadata

The Shapiro-Wilk test is normally recommended as the best choice for omnibus testing of data normality. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3693611/. This test is more powerful than the Lilliefors, Kolmogorov-Smirnov, Anderson-Darling, and other tests for small data sizes. If the test is significant, the distribution is non-normal. For extremely small sample sizes (e.g. n < 10), the following steps may help in detecting outliers: 1. Fit the model (e.g. non-linear) to the raw data. 2. Test residuals for normality with an appropriate robust test (e.g. the Shapiro-Wilk test). 3. If residuals are normal, use parametric bootstrapping to estimate model parameter confidence intervals. If not, use nonparametric bootstrapping.

NB: D’Agostino-Pearson (\(K^{2}\)) test has very poor power against (graphically and statistically non-normal) data whose skewness and kurtosis are similar to those of a normally distributed data.

## Shapiro-Wilk normality screening: histogram + density for each metadata
## variable, annotated with the test's p-value and W statistic.
data.shapiro <- column_to_rownames(metadata, var = "ID")

## Drop SML columns and variables excluded from the normality screen.
data.shapiro.sub <- subset(data.shapiro, select = !(grepl("SML", names(data.shapiro))))
data.shapiro.sub <- subset(data.shapiro.sub, select = !names(data.shapiro.sub) %in% 
                             c("Hypertension", "aPHQ9", "dbp","Diabetes", "Gender","simpson_diversity","pielou_evenness" ))
## Renamed from `colnames` to avoid shadowing base::colnames.
var_names <- dimnames(data.shapiro.sub)[[2]]

par(mfrow=c(2, 3))
for (i in seq_len(ncol(data.shapiro.sub))) {
    df <- as.vector(na.omit(data.shapiro.sub[,i]))
    st <- shapiro.test(df)  # run the test once; reuse p-value and W below
    p <- round(st$p.value, digits = 4)
    w <- round(st$statistic[[1]][1], digits = 4)
    hist(df,  main=paste0("ID=",var_names[i],"; n=",length(df),"\nShapiro-Wilk:\n[p-value=",p,"; W=",w,"]"), probability=TRUE, col="gray", border="white")
    lines(density(df), lwd = 2, col = "chocolate3")
}

Based on this analysis, we’ve observed that some observations will need to be log transformed before any subsequent analysis. The histograms also allow us to make decisions on the skewness and kurtosis of the data independent of the Shapiro-Wilk p-value.

Anova, ancova, and Mann-Whitney U test

ANOVA

## ANOVA of each metadata variable by Site (log-transforming skewed
## biomarkers first), followed by Tukey HSD pairwise comparisons.
dat.anova <- data.shapiro.sub

## Biomarkers to log10(x + 1)-transform (not normally distributed).
trans.biomark <- c("rbg","TNFalpha","CRP","HbA1cIFCC","IFN_gamma","IL10","IL12p40","IL12p70","IL13","IL15","IL17A","IL18","IL1beta","IL2","IL33","IL4","IL5","IL6","LBP","MCP1","MIP1alpha")
dat.anova[trans.biomark] <- lapply(dat.anova[trans.biomark],function(p) {log10(p+1)}) ##log biomarkers not normally distributed..
## Replace any Inf produced by the transform with NA so aov() drops them.
dat.anova.log <- do.call(data.frame,lapply(dat.anova, function(x) replace(x, is.infinite(x),NA)))

## Descriptive statistics, overall and split by site.
dat.descr.all <- round(psych::describe(dat.anova.log, IQR = TRUE), digits = 2)
dat.descr.site <- psych::describeBy(dat.anova.log, dat.anova.log$Site, IQR = TRUE)
dat.descr.site.df <- round(cbind(dat.descr.site$`1`, dat.descr.site$`2`), digits = 3)

# write.table(dat.descr.all, "results/metadata_stats.tsv", sep = "\t")
# write.table(dat.descr.site.df, "results/metadata_stats_by_site.tsv", sep = "\t")

## (Removed the dead `AVz <- rep(NA, ...)` preallocation — AVz was
## overwritten with a scalar summary on every iteration anyway.)
# sink("Anova_unadjusted.doc")
for (i in seq_len(ncol(dat.anova.log))) {
  column <- names(dat.anova.log[i])
  AVz <- summary(aov(dat.anova.log[,i] ~ Site, data = dat.anova.log))
  ## Tukey needs Site as a factor, hence the separate factor(Site) fit.
  tk <- TukeyHSD((aov(dat.anova.log[,i] ~ factor(Site), data = dat.anova.log)))
  cat(paste0("ANOVA FOR: ",column,"\n\n"))
  print(AVz)
  cat(paste0("\nANOVA TukeyHSD TEST FOR: ",column))
  print(tk)
  cat("------------------------------------------------------------------------\n\n")
}
## ANOVA FOR: Age
## 
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## Site         1   2601  2601.0   9.293 0.00296 **
## Residuals   98  27430   279.9                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: Age  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##     diff      lwr      upr     p adj
## 2-1 10.2 3.559907 16.84009 0.0029579
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Alcohol
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1    4.1   4.138   1.134   0.597
## Residuals   95  346.6   3.649               
## 3 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: Alcohol  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##          diff        lwr      upr     p adj
## 2-1 0.4131017 -0.3570307 1.183234 0.5966223
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: BMI
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1      1    0.94   0.018  0.893
## Residuals   98   5023   51.26               
## 
## ANOVA TukeyHSD TEST FOR: BMI  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##       diff       lwr      upr     p adj
## 2-1 -0.194 -3.035542 2.647542 0.8925062
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: CRP
## 
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Site         1  3.917   3.917   32.96 1.63e-07 ***
## Residuals   80  9.507   0.119                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 18 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: CRP  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##          diff       lwr       upr p adj
## 2-1 0.4418595 0.2886908 0.5950281 2e-07
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: SBP
## 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Site         1   1641  1640.6   5.006 0.0275 *
## Residuals   97  31788   327.7                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: sbp  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##         diff       lwr      upr     p adj
## 2-1 8.142041 0.9196159 15.36447 0.0275467
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: MAP
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1   1327  1326.2   7.001 0.034 *
## Residuals   97  13719   141.4
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1               
## 1 observation deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: MAP  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff       lwr     upr    p adj
## 2-1 -7.0892517 -4.833924 4.65542 0.0342695
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Fruits
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1    0.2  0.2025   0.171   0.68
## Residuals   98  116.0  1.1841               
## 
## ANOVA TukeyHSD TEST FOR: Fruits  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##      diff       lwr      upr     p adj
## 2-1 -0.09 -0.521891 0.341891 0.6801182
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: HbA1cIFCC
## 
##             Df   Sum Sq   Mean Sq F value Pr(>F)
## Site         1 0.000289 0.0002890   2.463   0.1274
## Residuals   82 0.009622 0.0001173               
## 16 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: HbA1cIFCC  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##            diff           lwr         upr     p adj
## 2-1 0.003719272 -0.0009951046 0.008433649 0.1274021
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IFN_gamma
## 
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Site         1 0.2332 0.23318   14.65 0.000228 ***
## Residuals   98 1.5600 0.01592                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: IFN_gamma  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr       upr     p adj
## 2-1 0.09657659 0.04650157 0.1466516 0.0002282
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL12p40
## 
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Site         1 0.6786  0.6786   44.71 1.41e-09 ***
## Residuals   98 1.4876  0.0152                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: IL12p40  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##         diff      lwr       upr p adj
## 2-1 0.164756 0.115857 0.2136551     0
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL12p70
## 
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Site         1  1.838  1.8382   71.51 2.67e-13 ***
## Residuals   98  2.519  0.0257                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: IL12p70  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##          diff       lwr       upr p adj
## 2-1 0.2711572 0.2075262 0.3347881     0
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL13
## 
##             Df  Sum Sq  Mean Sq F value  Pr(>F)   
## Site         1 0.01313 0.013132   9.706 0.00241 **
## Residuals   98 0.13260 0.001353                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: IL13  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff         lwr        upr     p adj
## 2-1 0.02291922 0.008319931 0.03751852 0.0024098
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL15
## 
##             Df  Sum Sq  Mean Sq F value Pr(>F)
## Site         1 0.00212 0.002116   0.763  0.281
## Residuals   98 0.27185 0.002774               
## 
## ANOVA TukeyHSD TEST FOR: IL15  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##            diff         lwr        upr     p adj
## 2-1 0.009200122 -0.01170378 0.03010403 0.28105835
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL18
## 
##             Df Sum Sq Mean Sq F value  Pr(>F)    
## Site         1 0.0982 0.09815   19.24 2.9e-05 ***
## Residuals   98 0.4999 0.00510                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: IL18  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##          diff        lwr        upr   p adj
## 2-1 0.0626589 0.03431301 0.09100479 2.9e-05
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL1beta
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1  0.204 0.20429   2.507  0.534
## Residuals   63  5.134 0.08149               
## 35 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: IL1beta  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr       upr     p adj
## 2-1 -0.1122428 -0.2539046 0.0294189 0.5341804
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: IL4
## 
##             Df  Sum Sq  Mean Sq F value   Pr(>F)    
## Site         1 0.02106 0.021057   20.67 1.56e-05 ***
## Residuals   98 0.09983 0.001019                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: IL4  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr        upr    p adj
## 2-1 0.02902214 0.01635473 0.04168955 1.56e-05
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: LBP
## 
##             Df  Sum Sq Mean Sq F value Pr(>F)    
## Site         1 0.07699 0.07699   328.5 <2e-16 ***
## Residuals   98 0.02297 0.00023                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: LBP  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##            diff        lwr         upr p adj
## 2-1 -0.05549275 -0.0615685 -0.04941699     0
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: MCP1
## 
##             Df  Sum Sq Mean Sq F value   Pr(>F)    
## Site         1 0.01205 0.01205   26.77 1.22e-06 ***
## Residuals   98 0.04413 0.00045                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: MCP1  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr        upr   p adj
## 2-1 0.02195831 0.01353594 0.03038067 1.2e-06
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: MIP1alpha
## 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Site         1  0.235 0.23493    5.21 0.0248 *
## Residuals   92  4.148 0.04509                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 6 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: MIP1alpha  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr         upr     p adj
## 2-1 -0.1001889 -0.1873638 -0.01301391 0.0247594
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: RBG
## 
##             Df  Sum Sq  Mean Sq F value Pr(>F)  
## Site         1 0.00218 0.002183   3.804 0.0557 .
## Residuals   86 0.04936 0.000574                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: rbg  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##            diff          lwr        upr     p adj
## 2-1 0.009985075 -0.000192629 0.02016278 0.05573964
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Seafood
## 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Site         1   42.3   42.25   5.839 0.0175 *
## Residuals   98  709.1    7.24                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: Seafood  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##     diff       lwr      upr    p adj
## 2-1  1.3 0.2323549 2.367645 0.017528
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: activity_minspw
## 
##             Df  Sum Sq Mean Sq F value Pr(>F)
## Site         1   14161   14161   0.396  0.925
## Residuals   98 3504193   35757               
## 
## ANOVA TukeyHSD TEST FOR: activity_minspw_mod  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##     diff       lwr      upr     p adj
## 2-1 23.8 -51.25073 98.85073 0.9250609
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: waist
## 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Site         1   1601  1601.0   3.926 0.019 *
## Residuals   94  38333   407.8                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 4 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: waist  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##          diff       lwr        upr     p adj
## 2-1 -8.167454 -16.35192 0.01700881 0.019104683
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: waist_hip_ratio
## 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Site         1 0.0553 0.05534   4.588 0.0348 *
## Residuals   94 1.1338 0.01206                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 4 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: waist_h_r  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff         lwr        upr     p adj
## 2-1 0.04805894 0.003508368 0.09260951 0.0347877
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Smoking_cig
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1      3    3.24   0.062  0.804
## Residuals   98   5150   52.55               
## 
## ANOVA TukeyHSD TEST FOR: Smoking_cig  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##     diff       lwr      upr     p adj
## 2-1 0.36 -2.517219 3.237219 0.8044233
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Sugar_drinks
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1    0.0  0.0025   0.001  0.576
## Residuals   98  163.8  1.6717               
## 
## ANOVA TukeyHSD TEST FOR: Sugar_drinks  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##     diff        lwr       upr     p adj
## 2-1 0.01 -0.5031579 0.5231579 0.57602308
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Takeaway
## 
##             Df Sum Sq Mean Sq F value Pr(>F)  
## Site         1   9.61    9.61   6.673 0.0113 *
## Residuals   98 141.14    1.44                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: Takeaway  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##      diff       lwr        upr     p adj
## 2-1 -0.62 -1.096306 -0.1436941 0.0112674
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: TNFalpha
## 
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Site         1 0.2985 0.29850   41.65 4.18e-09 ***
## Residuals   98 0.7024 0.00717                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: TNFalpha  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##          diff        lwr       upr p adj
## 2-1 0.1092707 0.07566997 0.1428715     0
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: Vegetables
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1   0.81   0.810   0.263  0.609
## Residuals   98 301.94   3.081               
## 
## ANOVA TukeyHSD TEST FOR: Vegetables  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##      diff        lwr       upr     p adj
## 2-1 -0.18 -0.8766606 0.5166606 0.6092886
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: WBC
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1    1.1   1.098    0.46  0.499
## Residuals   85  202.8   2.386               
## 13 observations deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: WBC  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr       upr     p adj
## 2-1 -0.2253723 -0.8860511 0.4353064 0.4994601
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: shannon_diversity
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1   0.00 0.00014   0.001   0.061  .
## Residuals   98  21.17 0.21598               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: shannon_diversity  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##            diff        lwr       upr     p adj
## 2-1 0.002376945 -0.1820726 0.1868265 0.06123697
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: shannon_evenness
## 
##             Df Sum Sq  Mean Sq F value Pr(>F)
## Site         1 0.0063 0.006271   0.759  0.0273 *
## Residuals   98 0.8098 0.008263               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## ANOVA TukeyHSD TEST FOR: shannon_evenness  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##           diff        lwr        upr     p adj
## 2-1 0.01583734 -0.0202404 0.05191508 0.0273078
## 
## ------------------------------------------------------------------------
## 
## ANOVA FOR: chao_richness
## 
##             Df Sum Sq Mean Sq F value Pr(>F)
## Site         1      0    0.08   0.001  0.805
## Residuals   97  11511  118.67               
## 1 observation deleted due to missingness
## 
## ANOVA TukeyHSD TEST FOR: chao_richness  Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dat.anova.log[, i] ~ factor(Site), data = dat.anova.log)
## 
## $`factor(Site)`
##            diff       lwr      upr     p adj
## 2-1 -0.05755102 -4.403639 4.288536 0.80520866
## 
## ------------------------------------------------------------------------
# sink()

ANCOVA

Analysis was performed with adjustment for Age; note that the fitted model Site*Age also includes the Site × Age interaction term.

# Analysis of Covariance
## fit <- aov(y ~ A + x, data=mydataframe)

## Analysis of covariance: test each (log-transformed) variable against Site
## with Age as a covariate.
## NOTE(review): Site*Age fits the main effects AND the Site:Age interaction,
## not just an Age adjustment as the text above states -- confirm intended.
## The original preallocated AVz with rep(NA, ...) but overwrote it wholesale
## each iteration; the dead preallocation is removed.
# sink("Ancova_adjusted_for_age_site.doc")
for (i in seq_len(ncol(dat.anova.log))) {
  column <- names(dat.anova.log)[i]
  AVz <- summary(aov(dat.anova.log[, i] ~ Site * Age, data = dat.anova.log))
  cat(paste0("ANCOVA FOR: ", column, "\n\n"))
  print(AVz)
  cat("------------------------------------------------------------------------\n\n")
}
# sink()

Mann-Whitney Test

## Two-sample Wilcoxon (Mann-Whitney) test per variable, comparing the two
## site groups. Samples are assumed to be ordered by Site in two equal halves;
## the split point is now derived from the data instead of hard-coding 50/100,
## so the test still works if the sample count changes.
df <- data.frame(t(dat.anova.log), check.names = FALSE)
half <- ncol(df) %/% 2
pval_Utest <- data.table::as.data.table(apply(df, 1, function(x) {
  wilcox.test(x[seq_len(half)], x[(half + 1):length(x)])$p.value
}))

pval_Utest$ID <- rownames(df)
# write.table(pval_Utest, "man_whitney.tsv", sep = "\t")

Basic Exploratory Factor Analysis (EFA)

## Exploratory factor analysis on centred and scaled clinical variables.
dat.efa <- data.frame(scale(dat.anova, center = TRUE, scale = TRUE))
# psych::describe(dat.efa)

## Estimate how many factors to retain before fitting the model.
fa.parallel(x = dat.efa, fm = "minres", fa = "fa", main = "Minimal residual-MINRES")

## Parallel analysis suggests that the number of factors =  8  and the number of components =  NA
scree(dat.efa)

## Fit an 8-factor model with varimax rotation; the formula interface
## tolerates missing values (na.exclude) and regression scores are kept.
EFA.result <- factanal(~ ., data = dat.efa,
                       factors = 8,
                       rotation = "varimax",
                       na.action = na.exclude,
                       scores = "regression", warnings = FALSE)

loadings_efa <- data.frame(EFA.result$scores)
# print(EFA.result$loadings, cutoff=0.32)
# write.table(loadings_efa, "EFA_matrix.tsv", sep = '\t')

Structural Equation Modelling (SEM)

Structural equation modeling (SEM) is a form of causal modeling. It is a multivariate statistical analysis technique that is used to analyze structural relationships. This technique is the combination of factor analysis and multiple regression analysis, and it is used to analyze the structural relationship between measured variables and latent constructs. This method is preferred by the researcher because it estimates the multiple and interrelated dependence in a single analysis. In this analysis, two types of variables are used: endogenous variables and exogenous variables. Endogenous variables are equivalent to dependent variables; exogenous variables are equivalent to independent variables. Read more here and here.

Confirmatory Factor Analysis

A usual methodology for model evaluation is Confirmatory Factor Analysis (CFA) that is a particular case of SEM. It is a process which consists in specifying quantity and kinds of observed variables to one or more latent variables and analyze how well those variables measure the latent variable itself. Think of a latent variable as an artificial variable that is represented as a linear combination of observed variables.

Using Lavaan to obtain parameters estimates

Compared to AMOS and Stata, R lavaan is quite flexible. Read more [here].

SEM in LAVAAN

We defined four related theoretical scenarios and modelled them in SEM to predict interrelationships between exposures, the microbiome and host inflammation.

  • Framework 1: the gut microbiome mediates the exposure-inflammation relationship
  • Framework 2: the gut microbiome influences pathophysiology but is not influenced by risk exposures
  • Framework 3: the gut microbiome is influenced by exposures but does not influence pathophysiology
  • Framework 4: exposure risk factors are associated with inflammatory profile, and inflammatory profile predicts gut microbiome composition.

These models were fitted in SEM using MLR: maximum likelihood estimation with robust (Huber-White) standard errors and bootstrapping (asymptotically equivalent to the Yuan-Bentler test statistic) to account for data non-normality, the presence of dichotomous and categorical variables, and correction for multiple hypothesis testing.

## Standardise all metadata variables (z-scores) before SEM fitting, with
## sample IDs moved to row names.
dat.sem.std <- data.frame(scale(column_to_rownames(metadata, var = "ID"),
                                center = TRUE, scale = TRUE))

## lavaan syntax for the four theoretical frameworks. In every model,
## IBF1-IBF3 are latent inflammatory-biomarker factors measured (=~) by
## cytokine panels; SML1-SML6 are observed microbiome summary variables
## (presumably the EFA factor scores -- confirm upstream).
## Framework 1: exposures -> microbiome (SML) -> inflammation (IBF);
## the microbiome mediates the exposure-inflammation relationship.
model1 <-'
  # define latent variables
    IBF1 =~ IL4 + TNFalpha + IL12p70
    IBF2 =~ CRP + IL18 + IL12p40 + MCP1
    IBF3 =~ IL13 + IL15 + IFN_gamma + IL1beta
  # regressions
    SML1 ~ Age + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML2 ~ Age + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML3 ~ Age + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML4 ~ Age + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML5 ~ Age + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML6 ~ Age + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF1 ~ Age + Gender + Site + SML1+ SML2 + SML3 +SML6 + SML4 + SML5 + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF2 ~ Age + Gender + Site + SML1+ SML2 +SML3 + SML6 + SML4 + SML5 + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol    
    IBF3 ~ Age + Gender + Site + SML1+ SML2 +SML3 + SML6 + SML4 + SML5 + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
'

## Framework 2: the microbiome influences inflammation but is not itself
## regressed on the risk exposures.
model2 <-'
  # define latent variables
    IBF1 =~ IL4 + TNFalpha + IL12p70
    IBF2 =~ CRP + IL18 + IL12p40 + MCP1
    IBF3 =~ IL13 + IL15 + IFN_gamma + IL1beta
  # regressions
    IBF1 ~ Age  + Gender + Site + SML1+ SML2 +SML3 + SML6 + SML4 + SML5 + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF2 ~ Age  + Gender + Site + SML1+ SML2 +SML3 + SML6 + SML4 + SML5 + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF3 ~ Age  + Gender + Site + SML1+ SML2 +SML3 + SML6 + SML4 + SML5 + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
'

## Framework 3: exposures influence the microbiome and inflammation
## separately; the microbiome does not influence inflammation.
model3 <-'
  # define latent variables
    IBF1 =~ IL4 + TNFalpha + IL12p70
    IBF2 =~ CRP + IL18 + IL12p40 + MCP1
    IBF3 =~ IL13 + IL15 + IFN_gamma + IL1beta
  # regressions
    SML1 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol 
    SML2 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML3 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML6 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML4 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    SML5 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF1 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF2 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF3 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
'

## Framework 4: exposures predict inflammation, and inflammation (IBF)
## predicts microbiome composition (SML).
model4 <-'
  # define latent variables
    IBF1 =~ IL4 + TNFalpha + IL12p70
    IBF2 =~ CRP + IL18 + IL12p40 + MCP1
    IBF3 =~ IL13 + IL15 + IFN_gamma + IL1beta
  # regressions
    SML1 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol + IBF1 + IBF2 + IBF3
    SML2 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol + IBF1 + IBF2 + IBF3
    SML3 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol + IBF1 + IBF2 + IBF3
    SML6 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol + IBF1 + IBF2 + IBF3
    SML4 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol + IBF1 + IBF2 + IBF3
    SML5 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol + IBF1 + IBF2 + IBF3
    IBF1 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF2 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
    IBF3 ~ Age  + Gender + Site + Fruits + Vegetables + Takeaway + Sugar_drinks + Seafood + Smoking_cig + Alcohol
'

## Fit all four frameworks with identical estimation settings: robust ML
## (MLR) with Huber-White standard errors, a bootstrap test statistic and
## Wishart likelihood. The shared settings live in one helper so the four
## fits cannot drift apart.
fit_framework <- function(model.syntax) {
  sem(model = model.syntax, dat.sem.std, estimator = "MLR",
      test = "bootstrap", likelihood = "wishart",
      se = "robust.huber.white")
}
fit.model1 <- fit_framework(model1)
fit.model2 <- fit_framework(model2)
fit.model3 <- fit_framework(model3)
fit.model4 <- fit_framework(model4)

## Compare the four (non-nested) models on scaled/robust fit indices.
compare.fit <- compareFit(fit.model1, fit.model2, fit.model3, fit.model4,
                          nested = FALSE,
                          argsLRT = list(asymptotic = FALSE))

How do the models compare

## Render the model-comparison table as a styled HTML table.
comparison.table <- kableExtra::kable(compare.fit)
kableExtra::kable_styling(comparison.table,
                          bootstrap_options = c("striped", "hover", "condensed"))
chisq.scaled df.scaled pvalue.scaled cfi.robust tli.robust aic bic rmsea.robust srmr
fit.model1 390.959 102 0.058† 0.926† 0.707 2751.223† 2809.979† 0.061† 0.044†
fit.model2 392.654 107 0.048 0.859 0.67 4493.188 4808.521 0.087 0.048
fit.model3 386.832 107 0.010 0.762 0.565 4517.393 4804.329 0.087 0.048
fit.model4 264.986† 110 0.000 0.737 0.529 4528.990 4743.793 0.090 0.067

Plot the SEM best model as heatmaps

################## Heatmap exposures vs pathophysiology

## Shared post-processing for a table of SEM parameter estimates: round the
## standardized estimates, zero-out |Beta| < 0.1 for display (Beta1), attach
## significance stars, drop the raw B/SE/Z/P columns and fix the plotting
## order of the factors. Extracted because the SML and IBF tables previously
## duplicated this logic line-for-line.
## NOTE(review): the star cut-points are unconventional ('**' spans
## 0.001-0.05 and '*' spans 0.05-0.1); kept exactly as in the original.
finalize_estimates <- function(est, lhs_levels) {
  est$Beta <- round(est$Beta, digits = 2)
  ## Scale everything below 0.1 (absolute) to zero for the heatmap fill
  est$Beta1 <- ifelse(est$Beta < 0.1 & est$Beta > -0.1, 0, est$Beta)
  est$P <- round(est$P, digits = 4)
  est$pvalue <- car::Recode(est$P, "lo:0.001 = '***'; 0.001:0.05 = '**'; 0.05:0.1 = '*'; else = ' ';")
  est <- est[, -c(3:6)]  # drop B, SE, Z, P; keep factors, Beta, Beta1, stars
  est$Latent.Factor <- factor(est$Latent.Factor, levels = lhs_levels)
  est$Indicator <- factor(est$Indicator,
                          levels = c("Age","BMI","Gender","Site","Fruits","Vegetables","Takeaway",
                                     "Sugar_drinks","Seafood","Smoking_cig","Alcohol"))
  est
}

## Exposure effects on the microbiome summary factors (SML1-SML6)
parameter.estimates.df <- data.frame(parameterEstimates(fit.model1, standardized=TRUE) %>%
    dplyr::select('Latent.Factor'=lhs, Indicator=rhs, B=est, SE=se, Z=z, P=pvalue, Beta=std.all) %>%
    filter(grepl("SML",Latent.Factor)) %>%
    filter(!grepl("SML",Indicator)) %>%
    arrange(P) %>% na.omit())
parameter.estimates.df <- finalize_estimates(
  parameter.estimates.df,
  lhs_levels = rev(c("SML1","SML2","SML3","SML4","SML5","SML6")))

patho <- c("IBF1","IBF2", "IBF3")

## Exposure effects on the latent inflammation factors (IBF1-IBF3)
parameter.estimate.patho <- data.frame(parameterEstimates(fit.model1, standardized=TRUE) %>%
    dplyr::select('Latent.Factor'=lhs, Indicator=rhs, B=est, SE=se, Z=z, P=pvalue, Beta=std.all) %>%
    subset(Latent.Factor  %in% patho) %>%
    filter(!grepl("SML",Indicator)) %>%
    subset(!(Indicator %in% patho)) %>%
    arrange(P) %>% na.omit())
parameter.estimate.patho <- finalize_estimates(
  parameter.estimate.patho,
  lhs_levels = rev(patho))

## Combined heatmap of standardized estimates (exposures x factors)
ggplot(rbind(parameter.estimates.df, na.omit(parameter.estimate.patho)),
       aes(y = Latent.Factor, x = Indicator, fill = Beta)) +
  geom_tile(aes(fill = Beta, width = .95, height = .95), colour = "white") +
  # geom_text(size = 10, aes(label = as.character(pvalue), vjust = 0.5)) +
  theme_bw() +
  scale_fill_gradient2(low = "red2", mid = "white", high = "darkgreen",
                       midpoint = 0, space = "Lab", name = "Standardized\nestimates\n(Beta)",
                       breaks = c(1, 0.5, 0, -0.5, -1), limits = c(-1, 1)) +
  xlab(NULL) +
  ylab(NULL) +
  theme(legend.text = element_text(size = 12),
        axis.ticks = element_blank(),
        panel.border = element_blank()) +
  guides(fill = guide_legend(title.position = "top"))

And the latents for Biomarker Factors 1-3

Mediation analysis

## Prepare the taxon abundance table for mediation analysis.
abundance <- column_to_rownames(abundance.in, var = "ID")
## Prevalence filter: keep taxa with relative abundance > 0.01 in at least
## 40% of the samples (i.e. at most 60 samples below 0.01).
abundance <- abundance[rowSums(abundance < 0.01) <= 60, ]
## Rescale each taxon so its relative abundances add up to 1 across samples.
abundance <- sweep(abundance, 1, rowSums(abundance), "/")
## Transpose so samples are rows, then restore the ID column for joining.
abundance <- tibble::rownames_to_column(as.data.frame(t(abundance)), var = "ID")

dat <- left_join(metadata, abundance, by = "ID")
## Biomarkers that are not normally distributed: log10(x + 1)-transform them.
trans.biomark <- c("rbg","TNFalpha","CRP","HbA1cIFCC","IFN_gamma","IL10","IL12p40","IL12p70","IL13","IL15","IL17A","IL18","IL1beta","IL2","IL33","IL4","IL5","IL6","LBP","MCP1","MIP1alpha")
dat[trans.biomark] <- lapply(dat[trans.biomark], function(p) log10(p + 1))

outcomes <- c("BMI","waist_h_r","dbp","rbg","sbp","Hypertension","Diabetes","TNFalpha","CRP","HbA1cIFCC","IFN_gamma","IL10","IL12p40","IL12p70","IL13","IL15","IL17A","IL18","IL1beta","IL2","IL33","IL4","IL5","IL6","LBP","MCP1","MIP1alpha", "MAP")
exposures <- c("Age","Fruits","Seafood","Gender","Smoking_cig","Sugar_drinks","Takeaway","Vegetables","Alcohol","Site")

## Arcsine square-root (angular) transform for the taxa columns.
## NOTE(review): assumes the taxa start at column 55 of dat -- confirm
## against the metadata width (the mediation loop below starts at 47).
dat.arc <- dat
dat.arc[, 55:ncol(dat.arc)] <- lapply(dat.arc[, 55:ncol(dat.arc)],
                                      function(p) asin(sqrt(p)))

#-----------------------MULTIPROCESS---------------------------------

# cores = detectCores()
# cl = makeCluster(cores[1]-1) #not to overload your computer
# registerDoParallel(cl)

# library(foreach) ## If using multicore processing
# 
# mediation.dat <-
#   foreach (i = 1:length(outcomes), .combine = rbind) %:%
#     foreach(j = 1:length(exposures), .combine='c', .inorder=FALSE) %:%
#       foreach(k = 55:ncol(dat.arc), .combine='c', .inorder=FALSE) %dopar% {
#         set.seed(1234)
#         outc <- outcomes[i]
#         name <- exposures[j]
#         dat.x <- na.omit(dat.arc[,c(names(dat.arc[k]),name, outc)])
#         dput(names(dat.x))
#         colnames(dat.x) <- c("x","y","z")
#         ## A fitted model object for mediator
#         med.fit <- lm(x ~ y, data = dat.x)
#         ## A fitted model object for outcomes
#         out.fit <- glm(z ~ x + y ,data = dat.x, family = gaussian)
#         ## Mediate with bootsrapping
#         med.out <- mediation::mediate(med.fit, out.fit, treat = "y", mediator = "x", sims = 1000, boot = TRUE)
#         #print(summary(med.out))
#         mediation:::print.summary.mediate(summary(med.out))
#         print(strrep("-", 62), quote = F, row.names = F)
#       }
# 
# sink("Mediation_arcsineAbundance.doc")
# mediation.dat
# sink()
# parallel::stopCluster(cl)
#---------------------------------------------------------------------
# sink("Mediation_arcsineAbundance.doc")
## For every outcome x exposure x taxon triple, test whether the taxon (x)
## mediates the exposure (y) -> outcome (z) relationship with bootstrapped
## causal mediation analysis.
## BUG FIX: the original had its innermost closing brace followed by a
## commented-out `# }`, leaving the three nested for-loops unbalanced (a
## parse error); the brace structure is restored here.
## NOTE(review): taxa columns start at 47 here but the arcsine transform
## above starts at 55 -- confirm which index is correct.
for (outc in outcomes) {
  for (name in exposures) {
    for (i in 47:ncol(dat.arc)) {
      set.seed(1234)  # reproducible bootstrap draws for every triple
      dat.x <- na.omit(dat.arc[, c(names(dat.arc[i]), name, outc)])
      dput(names(dat.x))  # log which triple is being tested
      colnames(dat.x) <- c("x", "y", "z")
      ## A fitted model object for mediator
      med.fit <- lm(x ~ y, data = dat.x)
      ## A fitted model object for outcomes
      out.fit <- glm(z ~ x + y, data = dat.x, family = gaussian)
      ## Mediate with bootstrapping (1000 simulations)
      med.out <- mediate(med.fit, out.fit, treat = "y", mediator = "x",
                         sims = 1000, boot = TRUE)
      #print(summary(med.out))
      mediation:::print.summary.mediate(summary(med.out))
      print(strrep("-", 62), quote = FALSE, row.names = FALSE)
    }
  }
}

# sink()

Session information

# Record when the notebook was rendered; uncomment sessionInfo() to also
# capture the full package environment.
date()
## [1] "Wed Jul 29 15:19:10 2020"
# sessionInfo()

……………….. THE END …………………

Author: Fredrick M. Mobegi, PhD
Created: 24-02-2018 Mon 0830h
Updated: 29-07-2020 Wed 1400h
Copyright © 2020 | This notebook is for research purposes only and use of the linked data must be accompanied with written permission. It contains embargoed data and/or links and references to embargoed data from Aboriginal and Torres Strait Islander Health Surveys in a program called the Well Persons Health Check (WPHC). In 2016, Torres and Cape Hospital and Health Service (TCHHS) collaborated with James Cook University (Zenadth Kes Health Partnership) to conduct these surveys in two Australian island communities, Waiben and Mer. Use of this data in our study was approved by the Far North Queensland Human Research Ethics Committee (HREC/16/QCH/70-1059) and received written support from the local Community Council, Primary Health Care Service and TCHHS.
For permission requests, write to the Director of Microbiome Research Laboratory at SAHMRI.

…………………………………………….